GSE209552.# Define global constant
GSE_NUM <- "GSE209552"
To setup GEOmetadb Tutorial, follow GEOmetadb Tutorial Chapter 3
# Install required package if not installed
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
if (!requireNamespace("GEOmetadb", quietly = TRUE))
BiocManager::install("GEOmetadb")
if (!requireNamespace("DBI", quietly = TRUE))
install.packages("DBI")
if (!requireNamespace("RSQLite", quietly = TRUE))
install.packages("RSQLite")
if (!requireNamespace("dplyr", quietly = TRUE))
install.packages("dplyr")
if (!requireNamespace("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# Attach packages
library(GEOmetadb)
## Loading required package: R.utils
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.8.2 (2022-06-13 22:00:14 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.27.1 (2025-05-02 21:00:05 UTC) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
## The following object is masked from 'package:R.methodsS3':
##
## throw
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
## The following objects are masked from 'package:base':
##
## attach, detach, load, save
## R.utils v2.13.0 (2025-02-24 21:20:02 UTC) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
## The following object is masked from 'package:utils':
##
## timestamp
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, isOpen, nullfile, parse, use, warnings
## Loading required package: RSQLite
library(DBI)
library(RSQLite)
# Download SQLite database if not exist
if (!file.exists('GEOmetadb.sqlite')) getSQLiteFile()
file.info('GEOmetadb.sqlite')
# Connect to database
con <- dbConnect(SQLite(), 'GEOmetadb.sqlite')
Download data, follow Identifier Mapping Tutorial. The helper function defined below is modified based on Identifier Mapping Tutorial’s helper function to avoid duplicated download.
# Define helper function
fetch_geo_supp <- function(gse, destdir = "data") {
# Calculate the actual path where GEOquery puts the files
target_path <- file.path(destdir, gse)
# Check if directory exists AND contains files
if (dir.exists(target_path) && length(list.files(target_path)) > 0) {
message(paste("Data found at", target_path, "- Skipping download."))
return(invisible(target_path))
}
# If missing or empty, proceed with download
message(paste("Downloading", gse, "to", destdir, "..."))
dir.create(destdir, showWarnings = FALSE, recursive = TRUE)
GEOquery::getGEOSuppFiles(GEO = gse, baseDir = destdir, makeDirectory = TRUE)
invisible(target_path)
}
# Download
fetch_geo_supp(gse = GSE_NUM)
## Data found at data/GSE209552 - Skipping download.
This GSE contain both scRNA-seq and bulk RNA-seq sample. Removal of bulk RNA-seq is needed to avoid significant batch effect due to their technical difference. Although the number of sample is reduced to 17, it’s still sufficient amount.
Refer to the original paper’s
method section, only the scRNA-seq was done by
Illumina NextSeq6000 platform, GPL24676.
We can use this clue to exclude all bulk RNA-seq
Since all GSMs of this study are packed in one
GSE209552_RAW.tar file in GEO, we can’t indicate which
platform to include/exclude at download step. Thus we will construct a
list of GPL24676 GSMs for downstream reference by query
GEOmetabd.
# Define desired platform
scRNA_platform <- "GPL24676"
# Construct the query
# We join the 'link' table (gse_gsm) with the 'sample' table (gsm)
# to check the platform (gpl) column.
sql_filter <- paste0(
"SELECT t1.gsm ",
"FROM gse_gsm t1 ",
"JOIN gsm t2 ON t1.gsm = t2.gsm ",
"WHERE t1.gse = '", GSE_NUM, "' ",
"AND t2.gpl = '", scRNA_platform, "'"
)
# Get the list of pure scRNA-seq samples
sc_gsm_list <- dbGetQuery(con, sql_filter)$gsm
# View results
print(paste("Found", length(sc_gsm_list), "scRNA-seq samples."))
## [1] "Found 17 scRNA-seq samples."
sc_gsm_list
## [1] "GSM6376803" "GSM6376804" "GSM6376805" "GSM6376806" "GSM6376807"
## [6] "GSM6376811" "GSM6376812" "GSM6376813" "GSM6376814" "GSM6376815"
## [11] "GSM6376816" "GSM6376817" "GSM6376818" "GSM6376819" "GSM6376820"
## [16] "GSM6376821" "GSM6376822"
# Install Seurat if not
if (!requireNamespace("Seurat", quietly = TRUE))
install.packages("Seurat")
The raw count data was stored in GSE209552_RAW.tar file,
thus need to be unpacked using untar() function.
# Attach packages
library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 'SeuratObject' was built under R 4.5.0 but the current version is
## 4.5.2; it is recomended that you reinstall 'SeuratObject' as the ABI
## for R may have changed
##
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
##
## intersect, t
library(data.table)
library(Matrix)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
# Define data paths
tar_file <- "data/GSE209552/GSE209552_RAW.tar"
extract_dir <- "data/GSE209552/raw_files"
# Define unpack helper function
unpack_geo_tar <- function(tar_path, dest_dir) {
if (dir.exists(dest_dir) && length(list.files(dest_dir)) > 0) {
message(paste("📂 Extracted files found at", dest_dir, "- Skipping unpack."))
return(invisible(dest_dir))
}
if (!file.exists(tar_path)) {
stop(paste("❌ Tar file not found:", tar_path))
}
message(paste("📦 Unpacking", basename(tar_path), "..."))
dir.create(dest_dir, showWarnings = FALSE, recursive = TRUE)
untar(tar_path, exdir = dest_dir)
message("✅ Unpacking complete.")
return(invisible(dest_dir))
}
# Run the unpacker
unpack_geo_tar(tar_file, extract_dir)
## 📂 Extracted files found at data/GSE209552/raw_files - Skipping unpack.
The unpacked file is very messy, contain all 10x genomics scRNA-seq files, bulk RNA-seq files, subsetted Transposable Elements file, clustered files, and normalized files. Among those, only 10x genomics is what we need.
To extract, we utilize previously defined sc_gsm_list,
loop over it, exclude any file contain _TE_, indicate
Transposable Elements, only extract files end with either
barcode, matrix, or features.
Lastly, trim the file name to barcode, matrix,
features only, to meet the expected naming format of Seurat’s Read10X()
function.
# Reorganize all scRNA-seq 10x files in to a new directory
dest_dir <- "data/GSE209552/10x_organized" # Where they should go
dir.create(dest_dir, recursive = TRUE, showWarnings = FALSE)
# --- MAIN LOOP ---
message(paste("🚀 Organizing files for", length(sc_gsm_list), "samples..."))
## 🚀 Organizing files for 17 samples...
skipped_count <- 0
moved_count <- 0
for (gsm in sc_gsm_list) {
# 1. Define the specific destination path for this sample
sample_subdir <- file.path(dest_dir, gsm)
# 2. CHECK: Does it already exist?
if (dir.exists(sample_subdir) && length(list.files(sample_subdir)) >= 3) {
skipped_count <- skipped_count + 1
next
}
# 3. Create directory
dir.create(sample_subdir, showWarnings = FALSE)
# 4. Find source files (Exclude the 'TE' files)
pattern <- paste0("^", gsm)
files_to_move <- list.files(extract_dir, pattern = pattern, full.names = TRUE)
files_to_move <- files_to_move[!grepl("_TE_", files_to_move)]
# Safety Checks
if (length(files_to_move) == 0) {
warning(paste("⚠️ Skipping", gsm, "- No source files found."))
next
}
# 5. MOVE & RENAME (The Standardize Step)
# We identify specific files to ensure we rename them correctly for Seurat
f_matrix <- files_to_move[grep("matrix", files_to_move, ignore.case = TRUE)]
f_barcodes <- files_to_move[grep("barcodes", files_to_move, ignore.case = TRUE)]
f_features <- files_to_move[grep("features|genes", files_to_move, ignore.case = TRUE)]
# Verify we found exactly 1 of each before moving
if (length(f_matrix) == 1 && length(f_barcodes) == 1 && length(f_features) == 1) {
# Rename Matrix -> matrix.mtx.gz
file.rename(f_matrix, file.path(sample_subdir, "matrix.mtx.gz"))
# Rename Barcodes -> barcodes.tsv.gz
file.rename(f_barcodes, file.path(sample_subdir, "barcodes.tsv.gz"))
# Rename Genes/Features -> features.tsv.gz
# (Note: Seurat prefers 'features.tsv' over 'genes.tsv')
file.rename(f_features, file.path(sample_subdir, "features.tsv.gz"))
message(paste(" ✅ Standardized & Moved:", gsm))
moved_count <- moved_count + 1
} else {
warning(paste(" ⚠️ Skipping", gsm, "- Could not find unique 10x triplet to standardize."))
}
}
# --- SUMMARY ---
message("🎉 Organization Complete.")
## 🎉 Organization Complete.
message(paste(" 🔹 Samples moved & standardized:", moved_count))
## 🔹 Samples moved & standardized: 0
message(paste(" 🔸 Samples skipped (already done):", skipped_count))
## 🔸 Samples skipped (already done): 17
Loading of expression data is done by iteratively call Seurat’s Read10X()
and CreateSeuratObject() function over all GSMs. They
cooperatively first read a 10X genomics structured file and create a
Seurat Object, which in the basic unit for any data
manipulation in Seurat.
# Get the list of sample subdirectories
sample_dirs <- list.dirs(dest_dir, recursive = FALSE)
# Define a list storing seurat obj of all GSMs
seurat_list <- list()
message(paste("🚀 Loading", length(sample_dirs), "samples..."))
## 🚀 Loading 17 samples...
for (dir in sample_dirs) {
# Extract GSM ID from the folder name (e.g., "GSM6385438")
gsm_id <- basename(dir)
# A. Read 10x Data
# Because you renamed the files, Read10X finds them automatically!
counts <- Read10X(data.dir = dir)
# B. Create Seurat Object
seurat_list[[gsm_id]] <- CreateSeuratObject(
counts = counts,
project = gsm_id,
min.cells = 0,
min.features = 0
)
message(paste(" ✅ Loaded:", gsm_id))
}
## ✅ Loaded: GSM6376803
## ✅ Loaded: GSM6376804
## ✅ Loaded: GSM6376805
## ✅ Loaded: GSM6376806
## ✅ Loaded: GSM6376807
## ✅ Loaded: GSM6376811
## ✅ Loaded: GSM6376812
## ✅ Loaded: GSM6376813
## ✅ Loaded: GSM6376814
## ✅ Loaded: GSM6376815
## ✅ Loaded: GSM6376816
## ✅ Loaded: GSM6376817
## ✅ Loaded: GSM6376818
## ✅ Loaded: GSM6376819
## ✅ Loaded: GSM6376820
## ✅ Loaded: GSM6376821
## ✅ Loaded: GSM6376822
length(seurat_list)
## [1] 17
However, the metadata (contain condition information) need to fetched
from GEOmetadb,
and extract from GSM table’s
characteristics_ch1 column, and then insert into
corresponding Seurat object’s metadata layer.
# Load GSMs metadata from GEOmetadb
gsm_list_string <- paste(paste0("'", sc_gsm_list, "'"), collapse = ",")
sql <- paste0(
"SELECT gsm, title, characteristics_ch1 ",
"FROM gsm ",
"WHERE gsm IN (", gsm_list_string, ")"
)
metadata <- dbGetQuery(con, sql)
# Merge the condition metadata to corresponding Seurat obj's metadata
metadata$Condition <- sub(".*condition:\\s*([^;]+).*", "\\1", metadata$characteristics_ch1, ignore.case = TRUE)
# Loop and assign
for (gsm in names(seurat_list)) {
# Find the condition for this GSM
val <- metadata$Condition[metadata$gsm == gsm]
# Assign to Seurat object (handling missing values)
if (length(val) > 0) {
seurat_list[[gsm]]$Condition <- val
} else {
seurat_list[[gsm]]$Condition <- "Unknown"
}
}
# Final Merge all samples into one seurat obj
combined_seurat <- merge(
x = seurat_list[[1]],
y = seurat_list[-1],
add.cell.ids = names(seurat_list),
project = "GSE209552"
)
# Verify
table(combined_seurat$Condition)
##
## control tbi
## 10666 13955
Plot the following statistics:
library(ggplot2)
library(patchwork)
theme_set(theme_classic(base_size = 26))
# 1. Coverage (nFeature_RNA)
p_cov <- VlnPlot(
combined_seurat,
features = "nFeature_RNA",
group.by = "Condition",
pt.size = 0
) +
ggtitle("Coverage") +
ylab("nFeature_RNA")
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
# 2. Depth (nCount_RNA) with adjusted axis
p_depth <- VlnPlot(
combined_seurat,
features = "nCount_RNA",
group.by = "Condition",
pt.size = 0
) +
ggtitle("Depth") +
ylab("nCount_RNA") +
scale_y_continuous(limits = c(0, 50000))
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
# Combine them to recreate p_metrics
p_metrics <- p_cov | p_depth
# Number of cells among conditions
meta_data <- combined_seurat@meta.data
# A. Bar plot: Total Number of Cells per Condition
p_cells <- ggplot(meta_data, aes(x = Condition, fill = Condition)) +
geom_bar(color = "black") +
geom_text(stat = 'count', aes(label = after_stat(count)), vjust = -0.5) +
theme_classic() +
labs(title = "Total Cells per Group", y = "Number of Cells", x = NULL) +
theme(legend.position = "none")
# Number of samples among conditions
p_samples <- meta_data %>%
group_by(Condition) %>%
summarize(Sample_Count = n_distinct(orig.ident)) %>%
ggplot(aes(x = Condition, y = Sample_Count, fill = Condition)) +
geom_bar(stat = "identity", color = "black") +
geom_text(aes(label = Sample_Count), vjust = -0.5) +
theme_classic() +
labs(title = "Total Samples (GSMs) per Group", y = "Number of Samples (GSMs)", x = NULL) +
theme(legend.position = "none")
# Merge plots
final_plot <- p_metrics / (p_samples | p_cells)
# Adjust font size
final_plot <- final_plot & theme_classic(base_size = 16)
final_plot
## Warning: Removed 352 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
Figure 1: Overall statistics of dataset
GSE209552 grouped by condition (Control v.s. tbi). The
upper-left panel indicate the number of unique identifier; The
upper-right panel indicate the total amount of identifier, the maximum
amount of identifier was limited up to 50000 for the sake of aesthetics;
The lower-left panel indicate number of samples (GSMs); The lower-right
panel indicate the number of cells”; X-axis of all four panel indicate
condition.
Generally, control group have higher depth and coverage, but less sample and cells.
Then plot the statistics below per sample (GSM):
# Define the vertical line style once to keep it consistent
vertical_line <- geom_vline(xintercept = 5.5, linetype = "dashed", color = "red", size = 1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# 1. Coverage (nFeature_RNA)
p_cov <- VlnPlot(
combined_seurat,
features = "nFeature_RNA",
group.by = "orig.ident",
pt.size = 0
) +
ggtitle("Coverage") +
ylab("nFeature_RNA") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none",
axis.title.x = element_blank() # Often cleaner to remove x-label if text is rotated
) + vertical_line
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
# 2. Depth (nCount_RNA) with adjusted axis
p_depth <- VlnPlot(
combined_seurat,
features = "nCount_RNA",
group.by = "orig.ident",
pt.size = 0
) +
ggtitle("Depth") +
ylab("nCount_RNA") +
scale_y_continuous(limits = c(0, 50000)) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none",
axis.title.x = element_blank()
) + vertical_line
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
# A. Bar plot: Total Number of Cells per Condition
meta_data <- combined_seurat@meta.data # Ensure meta_data is defined
p_cells <- ggplot(meta_data, aes(x = orig.ident, fill = orig.ident)) +
geom_bar(color = "black") +
geom_text(stat = 'count', aes(label = after_stat(count)), vjust = -0.5) +
theme_classic() +
labs(title = "Total Cells per Sample", y = "Number of Cells", x = NULL) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none"
) +
coord_fixed(ratio = 0.0005) +
scale_y_continuous(limits = c(0, 6000)) + vertical_line
# Merge plots
final_plot <- p_cov / p_depth / p_cells +
plot_layout(heights = c(1, 1, 1))
# Adjust font size (and re-apply theme elements if needed by the base theme)
final_plot <- final_plot & theme_classic(base_size = 16) &
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none"
)
final_plot
## Warning: Removed 352 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
Figure 2: Overall statistics per sample (GSM). The top
panel indicate number of unique identifier; The middle panel indicate
number of total identifier; The bottom panel indicate number of cells;
X-axis of all three panel indicate samples (GSMs); The vertial red dash
line sepeate control (left side of red line), and tbi (right side of red
line).
In general, the sequence depth and coverage is more consistence among
control group. Tbi group has more variated depth and coverage with
GSM637681 being the sample of lowest depths and coverage.
The number of cells within different sample also varies, from highest
3708 (GSM6376822) to lowest 358 (GSM6376820).
This imbalance in cell number may result in downstream analysis dominate
by sample which contain more cells.
Another common quality control step in scRNA-seq is to remove
damaged cells. When damaged cells were sequenced or damaged
during sequencing, the expression intensity will scaled down due to
leaked RNA, thus significantly bias downstream clustering, annotation,
and differential gene expression analysis. A common method to conduct
this filtering is using
mitochondrial transcript percentage, based on this idea
that if a cell has leaked RNA, the
mitochondrial transcript percentage will be relatively
higher since mitochondria is relatively large and less likely to
leak.
# Use seurat PercentageFeatureSet() function
# Create a new entry in meta.data layer storing mitochondrial transcript percentage
combined_seurat[["percent.mt"]] <- PercentageFeatureSet(combined_seurat, pattern = "^MT-")
VlnPlot(combined_seurat, features = c("percent.mt")) +
theme(
# Rotate x-axis labels 45 degrees so they don't overlap
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none" # Hide legend since x-axis already shows the names
) + vertical_line
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
Figure 3: Mitochondrial transcript percentage grouped
by GSMs. The Y-axis indicate mitochondrial transcript percentage; The
X-axis indicate GSM; The vertial red dash line sepeate control (left
side of red line), and tbi (right side of red line).
The mitochondrial transcript percentage aligned with previous depth
and coverage among samples, where control group tend to be more
consistent. A hypothesis is that Traumatic Brain Injury (TBI) is a broad
term that include various brain damage result from external force, and
the intensity and exact pathology may vary among different tbi sample,
reflected by sequence depth and coverage. Noticeably,
GSM6376814 has relatively higher mitochondrial transcript
percentage, and recall that it’s also the sample having lowest depth and
coverage, which might imply existing technical variance that could bias
downstream analysis.